Topic modelling of the Enron email dataset with gensim's LDA

Imports

Import the built-in, processing, and display libraries used throughout the notebook.


In [1]:
%%bash
# List files whose name contains a literal ".csv".
# The dot must be escaped: an unescaped "." is a regex wildcard and would
# also match names like "xcsv".
ls | grep '\.csv'


emails.csv
emails.csv.zip

In [2]:
# built-in libs
import email

# processing libs
import pandas as pd

# display libs
from tqdm import tqdm_notebook

Import data


In [3]:
# Read only the first 10,000 emails for this exploratory pass — the full
# Enron dump is much larger. `nrows` yields the same first chunk as the
# previous chunksize=10000 + next() pattern, without leaving an open
# TextFileReader behind (which was never consumed further anyway).
emails_df = pd.read_csv('emails.csv', nrows=10000)

In [4]:
# Sanity check: dimensions, then a rich-display preview of the first rows.
n_rows, n_cols = emails_df.shape
print((n_rows, n_cols))
emails_df.head()


(10000, 2)
Out[4]:
file message
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e...
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e...
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e...

In [5]:
emails_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
file       10000 non-null object
message    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB

In [6]:
%time
messages_obj_lst = []
messages_str_lst = []

message_metadata = {}

for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
        else:
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    
    payload = msg.get_payload() # decode=True
    
    messages_obj_lst.append(msg)
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
    #    break

print('messages_obj_lst size: %i' % len(messages_obj_lst))


CPU times: user 7 µs, sys: 2 µs, total: 9 µs
Wall time: 17.4 µs
messages_obj_lst size: 10000

In [7]:
# Attach the parsed message objects and their payload text as new columns.
emails_df = emails_df.assign(message_obj=pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload=pd.Series(messages_str_lst).values)

# Strip newlines from the payload text. Pass a real newline character and
# an explicit regex=False: the original r'\n' pattern relied on pandas'
# old regex=True default, which pandas 2.0 flipped to regex=False (where
# r'\n' would instead mean a literal backslash + n).
emails_df['payload'] = emails_df.payload.str.replace('\n', '', regex=False)

In [8]:
emails_df.head()


Out[8]:
file message message_obj payload
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Here is our forecast
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Traveling to have a business meeting takes the...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... test successful. way to go!!!
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Randy, Can you send me a schedule of the salar...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Let's shoot for Tuesday at 11:45.

In [9]:
# Peek at the Subject header of the first 50 parsed messages
# (blank lines correspond to emails with an empty or missing subject).
for msg in emails_df.message_obj[:50]:
    print(msg['Subject'])


Re:
Re: test

Re: Hello
Re: Hello

Re: PRC review - phone calls
Re: High Speed Internet Access
FW: fixed forward or other Collar floor gas price terms
Re: FW: fixed forward or other Collar floor gas price terms

Consolidated positions: Issues & To Do list
Consolidated positions: Issues & To Do list

Re: 2001 Margin Plan
Var, Reporting and Resources Meeting

Westgate
Meeting re: Storage Strategies in the West

Re: Not business related..
Re: Original Sept check/closing
San Juan Index
San Juan Index
Investment Structure
Investment Structure


Re: Gas Trading Vision meeting

Gas Physical/Financial Position
closing








Westgate Proforma-Phillip Allen.xls


Re: burnet

Re: Sept 1 Payment

Contact list for mid market


In [10]:
# Free the temporary lists and the raw text column; the parsed
# message_obj / payload columns now carry everything we need.
del messages_obj_lst, messages_str_lst

emails_df = emails_df.drop(columns='message')

In [ ]:


In [11]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [12]:
# Positional split: first 7,000 emails for training, remaining 3,000 held out.
train = emails_df.iloc[:7000]
test = emails_df.iloc[7000:]

In [13]:
# Collect the Subject line of every training email, then drop empty /
# missing subjects. The original loop indexed emails_df while iterating
# over train's length — equivalent only because train happens to be the
# first 7,000 rows; iterating train directly removes that fragility.
trainheadlines = [msg['Subject'] for msg in train.message_obj]
trainheadlines = list(filter(None, trainheadlines))

# Preview (the original wrapped this in a pointless identity comprehension).
trainheadlines[:10]


Out[13]:
['Re:',
 'Re: test',
 'Re: Hello',
 'Re: Hello',
 'Re: PRC review - phone calls',
 'Re: High Speed Internet Access',
 'FW: fixed forward or other Collar floor gas price terms',
 'Re: FW: fixed forward or other Collar floor gas price terms',
 'Consolidated positions: Issues & To Do list',
 'Consolidated positions: Issues & To Do list']

In [14]:
# trainvect = CountVectorizer()
# Trainfeature = trainvect.fit_transform(trainheadlines)

In [15]:
# ####Detailed view of Document Count Matrix
# DTM_With_Colm = pd.DataFrame(Trainfeature.toarray(),columns= trainvect.get_feature_names())

In [16]:
# Trainfeature.shape

In [17]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [18]:
%time
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
#Our Document
trainheadlines

# list for tokenized documents in loop
texts = []

# loop through document list
for i in trainheadlines:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs

In [19]:
%time
#generate LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=1,chunksize=10000,update_every=1)


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs

In [20]:
%time
import pyLDAvis.gensim
print(ldamodel.print_topics(num_topics=10, num_words=3))


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
[(0, '0.180*"re" + 0.014*"ng" + 0.011*"credit"'), (1, '0.034*"re" + 0.028*"fw" + 0.020*"call"'), (2, '0.020*"meet" + 0.019*"ga" + 0.017*"fw"'), (3, '0.102*"re" + 0.033*"ga" + 0.023*"trade"'), (4, '0.162*"re" + 0.014*"meet" + 0.010*"market"'), (5, '0.057*"enron" + 0.051*"re" + 0.030*"mention"'), (6, '0.220*"re" + 0.015*"10" + 0.014*"fw"'), (7, '0.093*"re" + 0.020*"request" + 0.018*"com"'), (8, '0.093*"re" + 0.019*"01" + 0.017*"enron"'), (9, '0.050*"re" + 0.020*"fw" + 0.016*"new"')]

In [21]:
ldamodel.print_topics(5)


Out[21]:
[(0,
  '0.180*"re" + 0.014*"ng" + 0.011*"credit" + 0.010*"fw" + 0.008*"view" + 0.008*"ga" + 0.007*"extra" + 0.006*"s" + 0.006*"execut" + 0.006*"request"'),
 (1,
  '0.034*"re" + 0.028*"fw" + 0.020*"call" + 0.013*"ga" + 0.012*"confer" + 0.012*"30" + 0.012*"10" + 0.011*"western" + 0.010*"meet" + 0.010*"12"'),
 (6,
  '0.220*"re" + 0.015*"10" + 0.014*"fw" + 0.010*"weekli" + 0.010*"daili" + 0.009*"2001" + 0.009*"chart" + 0.009*"ng" + 0.008*"survey" + 0.008*"p"'),
 (2,
  '0.020*"meet" + 0.019*"ga" + 0.017*"fw" + 0.016*"10" + 0.015*"re" + 0.015*"2001" + 0.014*"enron" + 0.013*"01" + 0.013*"report" + 0.011*"market"'),
 (3,
  '0.102*"re" + 0.033*"ga" + 0.023*"trade" + 0.016*"fw" + 0.016*"2001" + 0.012*"daili" + 0.011*"west" + 0.009*"meet" + 0.009*"chart" + 0.009*"updat"')]

In [22]:
# Build the interactive pyLDAvis topic-distance view; rendered in the next
# cell by evaluating `news`.
# NOTE(review): newer pyLDAvis releases renamed this module to
# pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
news = pyLDAvis.gensim.prepare(ldamodel,corpus, dictionary)


/usr/local/lib/python3.5/dist-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))

In [23]:
news


Out[23]:

In [ ]:


In [24]:
# %%bash
# nvidia-smi

In [ ]:


Bibliography

  • pip3 install stop-words
  • pip3 install pyLDAvis

In [ ]: